{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "processed_data_path = \"../processed_data/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = pd.read_csv(processed_data_path+\"X_train.csv\")\n",
    "X_test = pd.read_csv(processed_data_path+\"X_test.csv\")\n",
    "y_train = pd.read_csv(processed_data_path+\"y_train.csv\")\n",
    "y_test = pd.read_csv(processed_data_path+\"y_test.csv\")\n",
    "y_test = y_test.values.squeeze()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(X_train.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def score(y, pred):\n",
    "    TP = 0\n",
    "    FP = 0\n",
    "    FN = 0\n",
    "    TN = 0\n",
    "    for i in range(0, len(y)):\n",
    "        if int(pred[i]) == 1:\n",
    "            if int(y[i]) == 1:\n",
    "                TP = TP + 1\n",
    "            else:\n",
    "                FP = FP + 1\n",
    "        else:\n",
    "            if int(y[i]) == 1:\n",
    "                FN = FN + 1\n",
    "            else:\n",
    "                TN = TN + 1\n",
    "    return TP, TN, FP, FN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = X_train\n",
    "train_data['y'] = y_train\n",
    "\n",
    "#分离正负样本\n",
    "pos_train_data = train_data[train_data['y'] == 1]\n",
    "neg_train_data = train_data[train_data['y'] == 0]\n",
    "pos_train_data.reset_index(drop = True, inplace = True)\n",
    "neg_train_data.reset_index(drop = True, inplace = True)\n",
    "#new_count = n * pos_train_data['y'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(pos_train_data.reset_index().head())\n",
    "#print(pos_train_data.reset_index().describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neighbors import NearestNeighbors\n",
    "import random\n",
    "\n",
    "def smote_up_sample(data,N):    \n",
    "    #近邻数k\n",
    "    k = 3\n",
    "    #上采样倍率N\n",
    "    N = 1\n",
    "    #选择用于评估样本点距离的特征\n",
    "    feature = ['age', 'duration', 'campaign', 'pdays', 'previous','emp.var.rate', 'cons.price.idx', 'cons.conf.idx','euribor3m', 'nr.employed']\n",
    "          \n",
    "    #feature = data.columns\n",
    "    #feature = ['duration','euribor3m','age','nr.employed','campaign','emp.var.rate', 'pdays','education', 'cons.price.idx ']\n",
    "\n",
    "    #new_data用于存放插值得到的样本\n",
    "    new_data = np.zeros((data.shape[0]*N,len(feature)))\n",
    "    newdf = data.copy()\n",
    "    #print(data.head())\n",
    "    neighbors = NearestNeighbors(n_neighbors=k).fit(data.loc[:,feature].values)\n",
    "    for i in range(data.shape[0]):\n",
    "        #print(data.shape[0]*N)\n",
    "        point = data.loc[i,feature]\n",
    "        #print(point.values.reshape(1, -1))\n",
    "        distances, indices = neighbors.kneighbors(point.values.reshape(1, -1))\n",
    "        nn = random.randint(1,k-1)\n",
    "        dis = data.loc[indices[0,nn],feature].values.reshape(1, -1) - data.loc[i,feature].values.reshape(1, -1)\n",
    "        gap = random.random()\n",
    "        new_data[i] = data.loc[i,feature].values.reshape(1, -1) + dis * gap\n",
    "        newdf.loc[i,feature] = new_data[i]             \n",
    "    data = data.append(newdf)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#对正样本进行somte过采样\n",
    "#print(pos_train_data.head())\n",
    "pos_train_data = smote_up_sample(pos_train_data,N=2)\n",
    "\n",
    "#pos_train_data = smote_up_sample(pos_train_data)\n",
    "#简单复制正样本\n",
    "#pos_train_data = pd.concat([pos_train_data, pos_train_data],sort=False)\n",
    "#对负样本欠采样\n",
    "neg_train_data = neg_train_data.sample(frac = 0.5,random_state=42)\n",
    "#重新拼接正负样本\n",
    "train_data_sample = pd.concat([pos_train_data, neg_train_data],sort=False)\n",
    "\n",
    "X_train = train_data_sample.drop(['y'],axis =1)\n",
    "y_train = train_data_sample['y']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n#用PCA做降维处理\\nfrom sklearn.decomposition import PCA\\npca = PCA(n_components=45)\\npca.fit_transform(X_train)\\nX_train = pca.transform(X_train)\\nX_test = pca.transform(X_test)\\nX_val = pca.transform(X_val)\\nprint(X_train.shape)\\n'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "#用PCA做降维处理\n",
    "from sklearn.decomposition import PCA\n",
    "pca = PCA(n_components=45)\n",
    "pca.fit_transform(X_train)\n",
    "X_train = pca.transform(X_train)\n",
    "X_test = pca.transform(X_test)\n",
    "X_val = pca.transform(X_val)\n",
    "print(X_train.shape)\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.\n",
      "  from numpy.core.umath_tests import inner1d\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'\\nfrom xgboost import XGBClassifier\\nxgb = XGBClassifier()\\nxgb.fit(X_train, y_train) \\ny_train_pred = xgb.predict(X_train)\\ny_test_pred = xgb.predict(X_test)\\ny_val_pred = xgb.predict(X_val)\\n'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "dc = DecisionTreeClassifier()\n",
    "dc.fit(X_train, y_train) \n",
    "y_train_pred = dc.predict(X_train)\n",
    "y_test_pred = dc.predict(X_test)\n",
    "y_val_pred = dc.predict(X_val)\n",
    "\"\"\"\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "rf = RandomForestClassifier(n_estimators=600,criterion = \"entropy\", oob_score=True)\n",
    "rf.fit(X_train, y_train) \n",
    "y_train_pred = rf.predict(X_train)\n",
    "y_test_pred = rf.predict(X_test)\n",
    "y_val_pred = rf.predict(X_val)\n",
    "\n",
    "\n",
    "\"\"\"\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "gbc = GradientBoostingClassifier(n_estimators=400, learning_rate=1.0,max_depth=1, random_state=0)\n",
    "gbc.fit(X_train, y_train)\n",
    "y_train_pred = gbc.predict(X_train)\n",
    "y_test_pred = gbc.predict(X_test)\n",
    "y_val_pred = gbc.predict(X_val)\n",
    "\"\"\"\n",
    "\n",
    "\"\"\"\n",
    "from xgboost import XGBClassifier\n",
    "xgb = XGBClassifier()\n",
    "xgb.fit(X_train, y_train) \n",
    "y_train_pred = xgb.predict(X_train)\n",
    "y_test_pred = xgb.predict(X_test)\n",
    "y_val_pred = xgb.predict(X_val)\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\ProgramData\\Anaconda3\\lib\\site-packages\\matplotlib\\figure.py:418: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure\n",
      "  \"matplotlib is currently using a non-GUI backend, \"\n"
     ]
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "#print(X_train.columns,rf.feature_importances_)\n",
    "f, ax = plt.subplots(figsize=(10, 5))\n",
    "plt.xticks(rotation=90)\n",
    "ax.bar(X_train.columns,rf.feature_importances_)\n",
    "ax.set_title(\"Feature Importances\")\n",
    "f.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train set:              precision    recall  f1-score   support\n",
      "\n",
      "        0.0       1.00      1.00      1.00     10956\n",
      "        1.0       1.00      1.00      1.00      5602\n",
      "\n",
      "avg / total       1.00      1.00      1.00     16558\n",
      "\n",
      "Valiation set:              precision    recall  f1-score   support\n",
      "\n",
      "        0.0       0.97      0.92      0.94      7296\n",
      "        1.0       0.55      0.80      0.65       942\n",
      "\n",
      "avg / total       0.92      0.90      0.91      8238\n",
      "\n",
      "Test set:              precision    recall  f1-score   support\n",
      "\n",
      "        0.0       0.97      0.92      0.94      7341\n",
      "        1.0       0.54      0.73      0.62       897\n",
      "\n",
      "avg / total       0.92      0.90      0.91      8238\n",
      "\n",
      "Percision：0.537082\n",
      "Recall：0.734671\n",
      "F1-score：0.620527\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "\n",
    "print(\"Train set:\", classification_report(y_train, y_train_pred))\n",
    "print(\"Valiation set:\", classification_report(y_val, y_val_pred))\n",
    "print(\"Test set:\", classification_report(y_test, y_test_pred))\n",
    "\n",
    "from sklearn.metrics import f1_score\n",
    "f1_score(y_test, y_test_pred)\n",
    "TP, TN, FP, FN = score(y_test, y_test_pred)\n",
    "P = TP / (TP + FP)\n",
    "R = TP / (TP + FN)\n",
    "F1 = (2 * P * R) / (P + R)\n",
    "\n",
    "print(\"Percision：%f\"%P)\n",
    "print(\"Recall：%f\"%R)\n",
    "print(\"F1-score：%f\"%F1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['feature_importances']\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 50 entries, age to day_of_week_wed\n",
      "Data columns (total 1 columns):\n",
      "feature_importances    50 non-null float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 3.3+ KB\n",
      "None\n",
      "[6.58988575e-02 2.98112460e-02 5.29338702e-07 1.39759386e-02\n",
      " 9.90355584e-03 3.41928536e-01 4.05287550e-02 1.99306090e-02\n",
      " 1.02506931e-02 4.58624441e-02 2.84140974e-02 3.11403093e-02\n",
      " 1.02600714e-01 5.49982662e-02 5.14652300e-03 6.02284074e-03\n",
      " 1.45761450e-02 9.55790378e-03 8.49089225e-03 2.98737401e-03\n",
      " 2.55376215e-03 5.62499358e-03 4.40187486e-03 3.80258684e-03\n",
      " 5.41842372e-03 3.94564791e-03 8.18617969e-03 2.44272671e-03\n",
      " 1.23340748e-03 5.54850911e-03 9.65104208e-03 8.76951784e-03\n",
      " 4.20107581e-04 8.65373727e-03 8.98220171e-03 3.99644146e-03\n",
      " 2.17362321e-03 5.28607997e-04 2.43965917e-03 2.89268649e-03\n",
      " 3.79992890e-03 8.88346564e-03 2.37143492e-03 5.62303680e-03\n",
      " 1.21037019e-03 8.99945307e-03 9.21942491e-03 9.15439180e-03\n",
      " 8.21727867e-03 8.82924738e-03]\n",
      "                      feature_importances\n",
      "duration                     3.419285e-01\n",
      "euribor3m                    1.026007e-01\n",
      "age                          6.589886e-02\n",
      "nr.employed                  5.499827e-02\n",
      "emp.var.rate                 4.586244e-02\n",
      "campaign                     4.052876e-02\n",
      "cons.conf.idx                3.114031e-02\n",
      "education                    2.981125e-02\n",
      "cons.price.idx               2.841410e-02\n",
      "pdays                        1.993061e-02\n",
      "poutcome_success             1.457615e-02\n",
      "housing                      1.397594e-02\n",
      "previous                     1.025069e-02\n",
      "loan                         9.903556e-03\n",
      "marital_married              9.651042e-03\n",
      "job_admin.                   9.557904e-03\n",
      "day_of_week_mon              9.219425e-03\n",
      "day_of_week_thu              9.154392e-03\n",
      "day_of_week_fri              8.999453e-03\n",
      "contact_telephone            8.982202e-03\n",
      "month_may                    8.883466e-03\n",
      "day_of_week_wed              8.829247e-03\n",
      "marital_single               8.769518e-03\n",
      "contact_cellular             8.653737e-03\n",
      "job_blue-collar              8.490892e-03\n",
      "day_of_week_tue              8.217279e-03\n",
      "job_technician               8.186180e-03\n",
      "poutcome_nonexistent         6.022841e-03\n",
      "job_management               5.624994e-03\n",
      "month_oct                    5.623037e-03\n",
      "marital_divorced             5.548509e-03\n",
      "job_services                 5.418424e-03\n",
      "poutcome_failure             5.146523e-03\n",
      "job_retired                  4.401875e-03\n",
      "month_apr                    3.996441e-03\n",
      "job_student                  3.945648e-03\n",
      "job_self-employed            3.802587e-03\n",
      "month_mar                    3.799929e-03\n",
      "job_entrepreneur             2.987374e-03\n",
      "month_jun                    2.892686e-03\n",
      "job_housemaid                2.553762e-03\n",
      "job_unemployed               2.442727e-03\n",
      "month_jul                    2.439659e-03\n",
      "month_nov                    2.371435e-03\n",
      "month_aug                    2.173623e-03\n",
      "job_unknown                  1.233407e-03\n",
      "month_sep                    1.210370e-03\n",
      "month_dec                    5.286080e-04\n",
      "marital_unknown              4.201076e-04\n",
      "default                      5.293387e-07\n"
     ]
    }
   ],
   "source": [
    "#F_I = pd.DataFrame(rf.feature_importances_.reshape((1,-1)), columns = X_train.columns)\n",
    "F_I = pd.DataFrame(rf.feature_importances_.reshape((-1,1)), index = X_train.columns, columns = ['feature_importances'])\n",
    "print(F_I.columns.tolist())\n",
    "print(F_I.info())\n",
    "print(rf.feature_importances_)\n",
    "print(F_I.sort_values(by = 'feature_importances', axis = 0,ascending = False))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
